Loading Packages

library(data.table) # Efficient Dataframe 
library(lubridate) # For Dates 
library(tidyverse) # Multiple Package for Useful Data wrangling
library(esquisse) # Intuitive plotting
library(plyr) # Data splitting
library(dplyr) # Data Wrangling
library(ggplot2) # Plot Graphs
library(naniar) # for NA exploration in Dataframe
library(plotly) # Make ggplot2 Dynamic
library(gridExtra) # Multiple Plot at once
library(RColorBrewer) # For Color Palette
library(rmdformats) # Theme of HTML
library(flextable) # Show Table
library(class) # K-NN
library(summarytools) # Beautiful and Efficient Summary for Dataset
library(pivottabler) # Pivot Table
library(naivebayes) # Naive Bayes Function
library(caret) # Confusion Matrix
library(leaps) # Exhaustive Search
library(forecast) # Predictions
library(neuralnet) # Neural Network
library(nnet) # Neural Network
library(manipulateWidget) # Plotly Combiner
library(rpart) # Regression Tree
library(rpart.plot) # Plotting Regression Tree
library(gains) # Lift Chart
library(adabag) # Boosting and Bagging
library(FactoMineR) # Clustering
library(factoextra) # Clustering
library(NbClust) # Optimal Clustering
library(qacr) # Clusters Profiling | install with: remotes::install_github("rkabacoff/qacr")
library(dendextend) # Cluster Dendrogram

Dataset Preparation

Loading all dataset: Cereals.csv, eBayAuctions.csv and UniversalBank.csv

# Load the three course datasets with data.table::fread (fast CSV reader).
# Paths are relative to the project root; each object is a data.table.
Cereals <- fread("data/Cereals.csv")
eBay <- fread("data/eBayAuctions.csv")
UniversalBank <- fread("data/UniversalBank.csv")

Cereals.csv

Quick Preview

# Preview of the Dataset: interactive table of the first 2 rows
DT::datatable(head(Cereals,2))

TABLE 4.8 - Breakfast Cereals Dataset

# Per-variable summary (stats, frequencies, inline distribution graphs)
# rendered as a grid-style table suitable for the HTML report.
dfSummary(Cereals, 
          plain.ascii  = FALSE, 
          style        = "grid", 
          graph.magnif = 0.75, 
          valid.col    = FALSE,
          tmp.img.dir  = "/tmp")

Data Frame Summary

Cereals

Dimensions: 77 x 16
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 name
[character]
1. 100%_Bran
2. 100%_Natural_Bran
3. All-Bran
4. All-Bran_with_Extra_Fiber
5. Almond_Delight
6. Apple_Cinnamon_Cheerios
7. Apple_Jacks
8. Basic_4
9. Bran_Chex
10. Bran_Flakes
[ 67 others ]
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
1 ( 1.3%)
67 (87.0%)
0
(0.0%)
2 mfr
[character]
1. A
2. G
3. K
4. N
5. P
6. Q
7. R
1 ( 1.3%)
22 (28.6%)
23 (29.9%)
6 ( 7.8%)
9 (11.7%)
8 (10.4%)
8 (10.4%)
0
(0.0%)
3 type
[character]
1. C
2. H
74 (96.1%)
3 ( 3.9%)
0
(0.0%)
4 calories
[integer]
Mean (sd) : 106.9 (19.5)
min < med < max:
50 < 110 < 160
IQR (CV) : 10 (0.2)
11 distinct values 0
(0.0%)
5 protein
[integer]
Mean (sd) : 2.5 (1.1)
min < med < max:
1 < 3 < 6
IQR (CV) : 1 (0.4)
1 : 13 (16.9%)
2 : 25 (32.5%)
3 : 28 (36.4%)
4 : 8 (10.4%)
5 : 1 ( 1.3%)
6 : 2 ( 2.6%)
0
(0.0%)
6 fat
[integer]
Mean (sd) : 1 (1)
min < med < max:
0 < 1 < 5
IQR (CV) : 2 (1)
0 : 27 (35.1%)
1 : 30 (39.0%)
2 : 14 (18.2%)
3 : 5 ( 6.5%)
5 : 1 ( 1.3%)
0
(0.0%)
7 sodium
[integer]
Mean (sd) : 159.7 (83.8)
min < med < max:
0 < 180 < 320
IQR (CV) : 80 (0.5)
27 distinct values 0
(0.0%)
8 fiber
[numeric]
Mean (sd) : 2.2 (2.4)
min < med < max:
0 < 2 < 14
IQR (CV) : 2 (1.1)
13 distinct values 0
(0.0%)
9 carbo
[numeric]
Mean (sd) : 14.8 (3.9)
min < med < max:
5 < 14.5 < 23
IQR (CV) : 5 (0.3)
21 distinct values 1
(1.3%)
10 sugars
[integer]
Mean (sd) : 7 (4.4)
min < med < max:
0 < 7 < 15
IQR (CV) : 8 (0.6)
16 distinct values 1
(1.3%)
11 potass
[integer]
Mean (sd) : 98.7 (70.4)
min < med < max:
15 < 90 < 330
IQR (CV) : 77.5 (0.7)
35 distinct values 2
(2.6%)
12 vitamins
[integer]
Mean (sd) : 28.2 (22.3)
min < med < max:
0 < 25 < 100
IQR (CV) : 0 (0.8)
0 : 8 (10.4%)
25 : 63 (81.8%)
100 : 6 ( 7.8%)
0
(0.0%)
13 shelf
[integer]
Mean (sd) : 2.2 (0.8)
min < med < max:
1 < 2 < 3
IQR (CV) : 2 (0.4)
1 : 20 (26.0%)
2 : 21 (27.3%)
3 : 36 (46.8%)
0
(0.0%)
14 weight
[numeric]
Mean (sd) : 1 (0.2)
min < med < max:
0.5 < 1 < 1.5
IQR (CV) : 0 (0.1)
0.50 : 2 ( 2.6%)
0.83 : 1 ( 1.3%)
1.00 : 64 (83.1%)
1.25 : 2 ( 2.6%)
1.30 : 1 ( 1.3%)
1.33 : 5 ( 6.5%)
1.50 : 2 ( 2.6%)
0
(0.0%)
15 cups
[numeric]
Mean (sd) : 0.8 (0.2)
min < med < max:
0.2 < 0.8 < 1.5
IQR (CV) : 0.3 (0.3)
12 distinct values 0
(0.0%)
16 rating
[numeric]
Mean (sd) : 42.7 (14)
min < med < max:
18 < 40.4 < 93.7
IQR (CV) : 17.7 (0.3)
77 distinct values 0
(0.0%)

We can see that Potass tends to be right skewed, as does Fiber. Rating has some outliers on the far right, and Weight seems to be very centralized, as does Calories, which has far outliers on both sides.

Missing Variables Plot

# Missing Variables Plot: percentage of NA per variable
gg_miss_var(Cereals, show_pct = TRUE)

# Proportion of NA per variable (mean of the logical NA mask)
mean(is.na(Cereals$potass))
mean(is.na(Cereals$sugars))
mean(is.na(Cereals$carbo))

# Proportion of NA over every cell of the whole dataset
mean(is.na(Cereals))

Cereals.csv has some missing values, such as Potass (2.6%), Sugars (1.3%) and Carbo (1.3%). The dataset has a proportion of 0.32% missing values.

eBayAuction.csv

Quick Preview

# Preview of the Dataset: interactive table of the first 2 rows
DT::datatable(head(eBay,2))
# Per-variable summary (stats, frequencies, inline distribution graphs)
dfSummary(eBay, 
          plain.ascii  = FALSE, 
          style        = "grid", 
          graph.magnif = 0.75, 
          valid.col    = FALSE,
          tmp.img.dir  = "/tmp")

Data Frame Summary

eBay

Dimensions: 1972 x 8
Duplicates: 346

No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 Category
[character]
1. Music/Movie/Game
2. Collectibles
3. Toys/Hobbies
4. Automotive
5. Antique/Art/Craft
6. SportingGoods
7. Clothing/Accessories
8. Home/Garden
9. Jewelry
10. Health/Beauty
[ 8 others ]
403 (20.4%)
239 (12.1%)
234 (11.9%)
178 ( 9.0%)
177 ( 9.0%)
124 ( 6.3%)
119 ( 6.0%)
102 ( 5.2%)
82 ( 4.2%)
64 ( 3.2%)
250 (12.7%)
0
(0.0%)
2 currency
[character]
1. EUR
2. GBP
3. US
533 (27.0%)
147 ( 7.5%)
1292 (65.5%)
0
(0.0%)
3 sellerRating
[integer]
Mean (sd) : 3560.2 (5973)
min < med < max:
0 < 1853 < 37727
IQR (CV) : 2785 (1.7)
461 distinct values 0
(0.0%)
4 Duration
[integer]
Mean (sd) : 6.5 (2.1)
min < med < max:
1 < 7 < 10
IQR (CV) : 2 (0.3)
1 : 23 ( 1.2%)
3 : 213 (10.8%)
5 : 466 (23.6%)
7 : 967 (49.0%)
10 : 303 (15.4%)
0
(0.0%)
5 endDay
[character]
1. Fri
2. Mon
3. Sat
4. Sun
5. Thu
6. Tue
7. Wed
287 (14.6%)
548 (27.8%)
351 (17.8%)
338 (17.1%)
202 (10.2%)
171 ( 8.7%)
75 ( 3.8%)
0
(0.0%)
6 ClosePrice
[numeric]
Mean (sd) : 36.4 (89.5)
min < med < max:
0 < 10 < 999
IQR (CV) : 23.1 (2.5)
827 distinct values 0
(0.0%)
7 OpenPrice
[numeric]
Mean (sd) : 12.9 (38.9)
min < med < max:
0 < 4.5 < 999
IQR (CV) : 8.8 (3)
291 distinct values 0
(0.0%)
8 Competitive?
[integer]
Min : 0
Mean : 0.5
Max : 1
0 : 906 (45.9%)
1 : 1066 (54.1%)
0
(0.0%)

We can see that sellerRating tends to have a lot of right outliers. Duration tends to be left skewed, as does endDay (having more auctions ending on Monday, for example).

Missing Variables Plot

# Missing Variables Plot: percentage of NA per variable
gg_miss_var(eBay, show_pct = TRUE)

eBayAuction.csv has no missing value.

UniversalBank.csv

Quick Preview

# Preview of the Dataset: interactive table of the first 2 rows
DT::datatable(head(UniversalBank,2))
# Per-variable summary (stats, frequencies, inline distribution graphs)
dfSummary(UniversalBank, 
          plain.ascii  = FALSE, 
          style        = "grid", 
          graph.magnif = 0.75, 
          valid.col    = FALSE,
          tmp.img.dir  = "/tmp")

Data Frame Summary

UniversalBank

Dimensions: 5000 x 14
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 ID
[integer]
Mean (sd) : 2500.5 (1443.5)
min < med < max:
1 < 2500.5 < 5000
IQR (CV) : 2499.5 (0.6)
5000 distinct values
(Integer sequence)
0
(0.0%)
2 Age
[integer]
Mean (sd) : 45.3 (11.5)
min < med < max:
23 < 45 < 67
IQR (CV) : 20 (0.3)
45 distinct values 0
(0.0%)
3 Experience
[integer]
Mean (sd) : 20.1 (11.5)
min < med < max:
-3 < 20 < 43
IQR (CV) : 20 (0.6)
47 distinct values 0
(0.0%)
4 Income
[integer]
Mean (sd) : 73.8 (46)
min < med < max:
8 < 64 < 224
IQR (CV) : 59 (0.6)
162 distinct values 0
(0.0%)
5 ZIP Code
[integer]
Mean (sd) : 93152.5 (2121.9)
min < med < max:
9307 < 93437 < 96651
IQR (CV) : 2697 (0)
467 distinct values 0
(0.0%)
6 Family
[integer]
Mean (sd) : 2.4 (1.1)
min < med < max:
1 < 2 < 4
IQR (CV) : 2 (0.5)
1 : 1472 (29.4%)
2 : 1296 (25.9%)
3 : 1010 (20.2%)
4 : 1222 (24.4%)
0
(0.0%)
7 CCAvg
[numeric]
Mean (sd) : 1.9 (1.7)
min < med < max:
0 < 1.5 < 10
IQR (CV) : 1.8 (0.9)
108 distinct values 0
(0.0%)
8 Education
[integer]
Mean (sd) : 1.9 (0.8)
min < med < max:
1 < 2 < 3
IQR (CV) : 2 (0.4)
1 : 2096 (41.9%)
2 : 1403 (28.1%)
3 : 1501 (30.0%)
0
(0.0%)
9 Mortgage
[integer]
Mean (sd) : 56.5 (101.7)
min < med < max:
0 < 0 < 635
IQR (CV) : 101 (1.8)
347 distinct values 0
(0.0%)
10 Personal Loan
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 4520 (90.4%)
1 : 480 ( 9.6%)
0
(0.0%)
11 Securities Account
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 4478 (89.6%)
1 : 522 (10.4%)
0
(0.0%)
12 CD Account
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 4698 (94.0%)
1 : 302 ( 6.0%)
0
(0.0%)
13 Online
[integer]
Min : 0
Mean : 0.6
Max : 1
0 : 2016 (40.3%)
1 : 2984 (59.7%)
0
(0.0%)
14 CreditCard
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 3530 (70.6%)
1 : 1470 (29.4%)
0
(0.0%)

We can see that Experience has some negative values in its range, thus questioning the way we should interpret Experience depending on the reference point. In our case, lacking further information about this dataset, we keep those values as they are. In a different scenario, knowing that the survey could produce negative experience values would oblige us to dig further into the data collection and survey processes and to question how accurate the values in our dataset are and how they were collected. We can also see that several variables are very right skewed (CCAvg, Income or Mortgage).

Missing Variables Plot

# Missing Variables Plot: percentage of NA per variable
gg_miss_var(UniversalBank, show_pct = TRUE)

UniversalBank.csv has no missing value.

Ex 13.1

Acceptance of Consumer Loan

# Reload UniversalBank.csv so Ex 13.1 starts from the raw data
UniversalBank <- fread("data/UniversalBank.csv")

# Drop identifiers with no predictive value (data.table negative
# character selection removes the named columns)
UniversalBank <- UniversalBank[,-c("ZIP Code","ID")]

Partition the data: 60% training, 40% validation.

# Fix the RNG state so the random partition is reproducible
set.seed(1)

# Target shares for the two partitions (60% training / 40% validation)
Training_Proportion <- 0.6
Validation_Proportion <- 1 - Training_Proportion

# One TRUE/FALSE draw per row: TRUE -> training, FALSE -> validation
sample <- sample(c(TRUE, FALSE), nrow(UniversalBank), replace=TRUE, prob=c(Training_Proportion,Validation_Proportion))

UB_Training <- UniversalBank[sample, ]
UB_Validation <- UniversalBank[!sample, ]

# Realised proportions (they vary slightly around the 60/40 target
# because each row is assigned independently)
Training_Proportion_Check <- nrow(UB_Training) / nrow(UniversalBank)
Validation_Proportion_Check <- nrow(UB_Validation) / nrow(UniversalBank)

# Report the realised split
print(paste("Proportion in Training is", Training_Proportion_Check*100,"%", "and in Validation is",Validation_Proportion_Check*100,"%"))
FALSE [1] "Proportion in Training is 61 % and in Validation is 39 %"

a. Fit Models to the data for (1) logistic regression, (2) k-nearest neighbors with k = 3, and (3) Classification trees.

Use Personal Loan as the outcome variable. Report the validation confusion matrix for each of the three models.

Logistic Regression

# Fix the RNG state for reproducibility
set.seed(1)

# Work on copies so the shared partitions stay untouched for later models
LR_Training <- UB_Training
LR_Validation <- UB_Validation

# Shared factor encodings, defined once and applied to both partitions
loan_levels <- c(0, 1)
loan_labels <- c("No Loan", "Loan")
edu_levels  <- c(1, 2, 3)
edu_labels  <- c("Undergrad", "Graduate", "Advanced/Professional")

# Outcome as a labelled factor
LR_Training$`Personal Loan` <- factor(LR_Training$`Personal Loan`,
                                      levels = loan_levels, labels = loan_labels)
LR_Validation$`Personal Loan` <- factor(LR_Validation$`Personal Loan`,
                                        levels = loan_levels, labels = loan_labels)

# Education as a labelled factor (glm will dummy-encode it automatically)
LR_Training$Education <- factor(LR_Training$Education,
                                levels = edu_levels, labels = edu_labels)
LR_Validation$Education <- factor(LR_Validation$Education,
                                  levels = edu_levels, labels = edu_labels)

# Fit the logistic regression on all remaining predictors
logit.reg <- glm(`Personal Loan` ~ ., data = LR_Training, family = "binomial")

# Disable scientific notation in printed output
options(scipen = 999)

# Coefficients, significance codes and deviance of the fitted model
summary(logit.reg)
FALSE 
FALSE Call:
FALSE glm(formula = `Personal Loan` ~ ., family = "binomial", data = LR_Training)
FALSE 
FALSE Deviance Residuals: 
FALSE     Min       1Q   Median       3Q      Max  
FALSE -2.1853  -0.1818  -0.0672  -0.0212   4.1974  
FALSE 
FALSE Coefficients:
FALSE                                   Estimate  Std. Error z value
FALSE (Intercept)                    -10.3177782   2.5770778  -4.004
FALSE Age                             -0.1295287   0.0971009  -1.334
FALSE Experience                       0.1462520   0.0961873   1.520
FALSE Income                           0.0589502   0.0038257  15.409
FALSE Family                           0.6028245   0.1013175   5.950
FALSE CCAvg                            0.2018200   0.0586504   3.441
FALSE EducationGraduate                3.9264743   0.3520393  11.154
FALSE EducationAdvanced/Professional   4.0665622   0.3382094  12.024
FALSE Mortgage                         0.0012640   0.0007709   1.640
FALSE `Securities Account`            -0.5440472   0.3724749  -1.461
FALSE `CD Account`                     3.7374014   0.4324751   8.642
FALSE Online                          -0.5841282   0.2188906  -2.669
FALSE CreditCard                      -0.8865331   0.2775008  -3.195
FALSE                                            Pr(>|z|)    
FALSE (Intercept)                           0.00006236641 ***
FALSE Age                                        0.182217    
FALSE Experience                                 0.128387    
FALSE Income                         < 0.0000000000000002 ***
FALSE Family                                0.00000000268 ***
FALSE CCAvg                                      0.000579 ***
FALSE EducationGraduate              < 0.0000000000000002 ***
FALSE EducationAdvanced/Professional < 0.0000000000000002 ***
FALSE Mortgage                                   0.101073    
FALSE `Securities Account`                       0.144118    
FALSE `CD Account`                   < 0.0000000000000002 ***
FALSE Online                                     0.007617 ** 
FALSE CreditCard                                 0.001400 ** 
FALSE ---
FALSE Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE 
FALSE (Dispersion parameter for binomial family taken to be 1)
FALSE 
FALSE     Null deviance: 1947.62  on 3049  degrees of freedom
FALSE Residual deviance:  686.36  on 3037  degrees of freedom
FALSE AIC: 712.36
FALSE 
FALSE Number of Fisher Scoring iterations: 8

Comments: We can see that Mortgage, Securities Account, Age and Experience are not statistically significant in this logistic regression. Income, EducationGraduate and EducationAdvanced/Professional, together with CD Account, seem to be really significant in this model, with very low p-values. EducationAdvanced/Professional, EducationGraduate and CD Account seem to have the most positive impact on whether or not an individual may contract a loan. One possibility would be to use a different method for variable selection in this logistic regression (which would certainly remove some insignificant variables) or even rescale variables by log in case of skewed distributions. Removing variables would certainly improve the model by adding more weight to the most important and statistically significant variables and thus to the predictions. In our case, we only mention this possibility without going through those steps, since we are trying to compare all predictor variables in each of the models without optimizing them, to have a fair comparison.

Predictions and Confusion Matrix

# Fix the RNG state for reproducibility
set.seed(1)

# Predicted probability of "Loan" for every validation row.
# predict.glm() only uses the predictor columns named in the model
# formula, so the full validation frame can be passed directly.
# (The original dropped a column via -c("`Personal Loan`"), a string
# containing literal backticks that matches no real column name.)
logit.reg.pred <- predict(logit.reg, LR_Validation, type = "response")

# Apply a 0.5 cutoff by rounding the probabilities to 0/1
logit.reg.pred_round <- round(logit.reg.pred)

# Strip names so the vector is a plain 0/1 numeric
logit.reg.pred_round <- as.numeric(logit.reg.pred_round)

# Side-by-side check of raw probability vs assigned class
dataframe_predictions <- cbind(logit.reg.pred, logit.reg.pred_round)

# Labelled factor so levels match the validation reference factor
logit.reg.pred_round <- factor(logit.reg.pred_round, levels = c(0,1),labels = c("No Loan","Loan"))

# Confusion matrix of predicted vs actual loan status on validation
Confusion_Matrix_LR <- confusionMatrix(data = logit.reg.pred_round, reference = LR_Validation$`Personal Loan`)

# Create the Function for Confusion Matrix
# Draw a styled 2x2 confusion-matrix graphic for the logistic regression
# model using base graphics.
#
# cm: a caret::confusionMatrix object.  The function reads cm$table
#     (the 2x2 cell counts), cm$byClass (per-class statistics) and
#     cm$overall (overall statistics).  Labels for the statistics are
#     taken from names() on the object itself, so they stay correct.
#
# All rect()/text() positions are hand-tuned to the fixed
# plot(c(100, 345), c(300, 450)) canvas; change coordinates together,
# not piecemeal.  Matrix rows are Predicted classes, columns Actual.
draw_confusion_matrix_LR <- function(cm) {

  # top panel (rows 1-2 of the layout): matrix; bottom panel: details
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Logistic Regression', cex.main=2)

  # create the matrix: header cells for the Actual classes on top,
  # shaded cells below for the two Predicted classes
  rect(150, 430, 240, 370, col='#1c6155')
  text(195, 435, 'No Loan', cex=1.2)
  rect(250, 430, 340, 370, col='#1c615570')
  text(295, 435, 'Loan', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')
  text(140, 400, 'No Loan', cex=1.2, srt=90)
  text(140, 335, 'Loan', cex=1.2, srt=90)

  # add in the cm results: cm$table flattened column-wise into the
  # four cell positions drawn above
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')

  # add in the specifics: elements 1, 2, 5, 6 and 7 of cm$byClass,
  # each printed with its own name above its rounded value
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)

  # add in the accuracy information: elements 1 and 2 of cm$overall
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}

# Plot the Confusion Matrix for the logistic regression
draw_confusion_matrix_LR(Confusion_Matrix_LR)

Comments: The Confusion Matrix shows us an Accuracy of 0.949 with the Logistic Regression model, but we have a very bad specificity, meaning that the model fails to give good predictions for the true negatives (here, Loan), which is our main goal in this predictive model.

K-nearest neighbors with k = 3

Data Preprocessing

# Fix the RNG state for reproducibility
set.seed(1)

# Outcome as a labelled factor in both partitions
UB_Training$`Personal Loan` <- factor(UB_Training$`Personal Loan`, levels = c(0,1),labels = c("No Loan","Loan")) 

UB_Validation$`Personal Loan` <- factor(UB_Validation$`Personal Loan`, levels = c(0,1),labels = c("No Loan","Loan"))

# Education as a factor (required by dummyVars)
UB_Training$Education <- factor(UB_Training$Education)

UB_Validation$Education <- factor(UB_Validation$Education)

# Library Caret
library(caret)

# One-hot encoder for Education, fitted on the TRAINING data only and
# applied to both partitions.  (The original also fitted a second,
# never-used encoder `dummy_2` on the validation set; fitting encoders
# on validation data is unnecessary and risks factor-level drift.)
dummy <- dummyVars(" ~ Education", data=UB_Training)
Education_Dummy <- data.frame(predict(dummy, newdata = UB_Training)) 
Education_Dummy_2 <- data.frame(predict(dummy, newdata = UB_Validation)) 

# Replace the raw Education column with its one-hot dummies
UB_Training <- UB_Training[,-c("Education")]
UB_Training <- cbind(UB_Training,Education_Dummy)

UB_Validation <- UB_Validation[,-c("Education")]
UB_Validation <- cbind(UB_Validation,Education_Dummy_2)

# Frame used only to FIT the normalisation: outcome and all binary /
# dummy columns excluded so only continuous predictors set the scaling
UB_Training_Preprocess <- UB_Training[,-c("Personal Loan","Online","CreditCard","Securities Account","CD Account","Education.1","Education.2","Education.3")]

# Centre/scale parameters estimated on training data only, then applied
# to both partitions -- no information leaks from validation to training
norm_values <- preProcess(UB_Training_Preprocess, method = c("center","scale"))

UB_Training_Preprocess <- predict(norm_values,UB_Training)
UB_Validation_Preprocess <- predict(norm_values, UB_Validation)

Predictions and Confusion Matrix

# Fix the RNG state for reproducibility
set.seed(1)

# Packages
library(class)
library(caret)

# Number of neighbours to consult
k <- 3

# Normalised predictor frames with the outcome column removed,
# computed once and reused below
train_predictors <- UB_Training_Preprocess[,-c("Personal Loan")]
valid_predictors <- UB_Validation_Preprocess[,-c("Personal Loan")]

# Fit k-NN (caret::knn3) on the normalised training predictors
NEW_KNN3 <- knn3(x = train_predictors, y = UB_Training_Preprocess$`Personal Loan`, k = k)

# Validation predictions: hard class labels and P(Loan) (2nd prob column)
Predictions_KNN_Training <- predict(NEW_KNN3, valid_predictors, type = "class")
Predictions_KNN_Training_Proba <- predict(NEW_KNN3, valid_predictors, type = "prob")[,2]

# Confusion matrix of predicted vs actual loan status on validation
Confusion_Matrix_KNN <- confusionMatrix(data = Predictions_KNN_Training, reference = UB_Validation$`Personal Loan`)

# Create the Function for Confusion Matrix
# Draw a styled 2x2 confusion-matrix graphic for the k-NN (k=3) model
# using base graphics.
#
# cm: a caret::confusionMatrix object.  The function reads cm$table
#     (the 2x2 cell counts), cm$byClass (per-class statistics) and
#     cm$overall (overall statistics).  Labels for the statistics are
#     taken from names() on the object itself, so they stay correct.
#
# All rect()/text() positions are hand-tuned to the fixed
# plot(c(100, 345), c(300, 450)) canvas; change coordinates together,
# not piecemeal.  Matrix rows are Predicted classes, columns Actual.
draw_confusion_matrix_KNN <- function(cm) {

  # top panel (rows 1-2 of the layout): matrix; bottom panel: details
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for KNN with k=3', cex.main=2)

  # create the matrix: header cells for the Actual classes on top,
  # shaded cells below for the two Predicted classes
  rect(150, 430, 240, 370, col='#1c6155')
  text(195, 435, 'No Loan', cex=1.2)
  rect(250, 430, 340, 370, col='#1c615570')
  text(295, 435, 'Loan', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')
  text(140, 400, 'No Loan', cex=1.2, srt=90)
  text(140, 335, 'Loan', cex=1.2, srt=90)

  # add in the cm results: cm$table flattened column-wise into the
  # four cell positions drawn above
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')

  # add in the specifics: elements 1, 2, 5, 6 and 7 of cm$byClass,
  # each printed with its own name above its rounded value
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)

  # add in the accuracy information: elements 1 and 2 of cm$overall
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}

# Plot the Confusion Matrix for the k-NN model
draw_confusion_matrix_KNN(Confusion_Matrix_KNN)

Comments: The KNN Confusion Matrix shows us a slightly better accuracy than the Logistic Regression model, with an Accuracy of 0.969 and a better Specificity of 0.683.

Classification Trees

# Fix the RNG state for reproducibility
set.seed(1)

# Tree Packages
library(rpart)
library(rpart.plot)

# Fit the classification tree.  The response must be referenced by its
# bare column name inside the formula: the original form
# `UB_Training$`Personal Loan` ~ .` names the response as an expression
# rather than a column, so the `.` can expand to ALL columns of `data`
# -- including Personal Loan itself -- leaking the target into the
# predictors.  Since Personal Loan is a factor here, rpart builds a
# classification tree by default.
CT <- rpart(`Personal Loan` ~ . , data=UB_Training)

# Plot the tree: leaf nodes shaded, internal nodes white
CT_Plot <- rpart.plot(CT, type=0, varlen = 0, box.col=ifelse(CT$frame$var == "<leaf>", '#8db0aa', 'white'), fallen.leaves = FALSE, extra = FALSE, main="Classification Trees", cex.main=1.5)

Comments: We can see that Classification Trees give us very different predictor variables. Income, CCAvg and Education.1 give the main splits in the tree, then CD Account, Family and Income again split further toward the main terminal nodes, until CCAvg for the 4th split.

Predictions and Confusion Matrix

# Set Seed for reproducibility
set.seed(1)

# Validation predictions: hard class labels and class probabilities
# (one probability column per class)
CT_Predictions <- predict(CT, UB_Validation, type = 'class')
CT_Predictions_Proba <- predict(CT, UB_Validation, type = 'prob')

# Confusion Matrix of predicted vs actual loan status on validation
Confusion_Matrix_CT <- confusionMatrix(data = CT_Predictions, reference = UB_Validation$`Personal Loan`)

# Create the Function for Confusion Matrix
# Draw a styled 2x2 confusion-matrix graphic for the classification
# tree model using base graphics.
#
# cm: a caret::confusionMatrix object.  The function reads cm$table
#     (the 2x2 cell counts), cm$byClass (per-class statistics) and
#     cm$overall (overall statistics).  Labels for the statistics are
#     taken from names() on the object itself, so they stay correct.
#
# All rect()/text() positions are hand-tuned to the fixed
# plot(c(100, 345), c(300, 450)) canvas; change coordinates together,
# not piecemeal.  Matrix rows are Predicted classes, columns Actual.
draw_confusion_matrix_CT <- function(cm) {

  # top panel (rows 1-2 of the layout): matrix; bottom panel: details
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Classification Trees', cex.main=2)

  # create the matrix: header cells for the Actual classes on top,
  # shaded cells below for the two Predicted classes
  rect(150, 430, 240, 370, col='#1c6155')
  text(195, 435, 'No Loan', cex=1.2)
  rect(250, 430, 340, 370, col='#1c615570')
  text(295, 435, 'Loan', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')
  text(140, 400, 'No Loan', cex=1.2, srt=90)
  text(140, 335, 'Loan', cex=1.2, srt=90)

  # add in the cm results: cm$table flattened column-wise into the
  # four cell positions drawn above
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')

  # add in the specifics: elements 1, 2, 5, 6 and 7 of cm$byClass,
  # each printed with its own name above its rounded value
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)

  # add in the accuracy information: elements 1 and 2 of cm$overall
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}

# Plot the Confusion Matrix for the classification tree
draw_confusion_matrix_CT(Confusion_Matrix_CT)

Comments: The Classification Trees Confusion Matrix gives us a far better look at the overall improvement in Specificity and Accuracy. Specificity is now 0.836, far better than previously, and Accuracy rises to 0.982, thus very close to an alpha error of 0.01.

Most Accurate Model

# Overall accuracy (first element of $overall) from each model's
# confusion matrix
Accuracy_CT <- Confusion_Matrix_CT$overall[1]

Accuracy_KNN <- Confusion_Matrix_KNN$overall[1]

Accuracy_LR <- Confusion_Matrix_LR$overall[1]

# One-row comparison table.  data.frame() already returns a data.frame,
# so the original's follow-up as.data.frame() call was a no-op and has
# been removed.
Accuracy_DF <- data.frame(Accuracy_CT,Accuracy_KNN,Accuracy_LR)

# Print the accuracy comparison as a data.table
data.table(Accuracy_DF)
FALSE    Accuracy_CT Accuracy_KNN Accuracy_LR
FALSE 1:   0.9815385    0.9692308   0.9487179

Comments: The most accurate model is the Classification Trees, having at least 98.1% accurate predictions (Training against Validation), the lowest accuracy model is the Logistic Regression with 94.8% Accuracy.

b. Create Dataframe with the actual outcome, predicted outcome, and each of the tree models

# Wrap each model's validation predictions in a one-column data.frame
LR_Dataframe <- data.frame(logit.reg.pred_round)
KNN_Dataframe <- data.frame(Predictions_KNN_Training)
CT_Dataframe <- data.frame(CT_Predictions)

# Bind predictions and the actual outcome side by side
All_Models_Predictions <- cbind(LR_Dataframe,KNN_Dataframe,CT_Dataframe, UB_Validation$`Personal Loan`)

# Human-readable column names, set in a single vectorised assignment
colnames(All_Models_Predictions) <- c("Predicted - LR", "Predicted - KNN", "Predicted - CT", "Actual")

# Interactive table of predicted vs actual classes
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation - 3 Models") 

c. Add two columns, a majority vote predicted of outcome and the average of predited probabilities. Derive a confusion matrix for each method and report the overall accuracy.

Majority Vote Column

# Recover the 0/1 numeric codes from the prediction factors
# ("No Loan" is level 1, "Loan" level 2, hence the -1 shift)
All_Models_Predictions$`Predicted - LR` <- as.numeric(All_Models_Predictions$`Predicted - LR`) - 1
All_Models_Predictions$`Predicted - KNN` <- as.numeric(All_Models_Predictions$`Predicted - KNN`) - 1
All_Models_Predictions$`Predicted - CT` <- as.numeric(All_Models_Predictions$`Predicted - CT`) - 1
All_Models_Predictions$Actual <- as.numeric(All_Models_Predictions$Actual) - 1

# Majority vote, vectorised: class 1 when at least 2 of the 3 models
# vote 1, i.e. when the row mean of the three 0/1 votes exceeds 0.5.
# This replaces the original per-row for-loop with one rowMeans() call.
All_Models_Predictions$Majority <- as.numeric(
  rowMeans(All_Models_Predictions[, c("Predicted - LR", "Predicted - KNN", "Predicted - CT")]) > 0.5
)

# Back to labelled factors with shared levels for confusionMatrix()
All_Models_Predictions$`Predicted - LR` <- factor(All_Models_Predictions$`Predicted - LR`, levels = c(0,1),labels = c("No Loan","Loan"))
All_Models_Predictions$`Predicted - KNN` <- factor(All_Models_Predictions$`Predicted - KNN`, levels = c(0,1),labels = c("No Loan","Loan"))
All_Models_Predictions$`Predicted - CT` <- factor(All_Models_Predictions$`Predicted - CT`, levels = c(0,1),labels = c("No Loan","Loan"))
All_Models_Predictions$Actual <- factor(All_Models_Predictions$Actual, levels = c(0,1),labels = c("No Loan","Loan"))
All_Models_Predictions$Majority <- factor(All_Models_Predictions$Majority, levels = c(0,1),labels = c("No Loan","Loan"))

# Show the table including the new Majority column
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Majority Vote Added) - 3 Models") 

Confusion Matrix with the Majority Vote

# Confusion Matrix for the majority-vote ensemble vs the actual outcome
Confusion_Matrix_Majority <- confusionMatrix(data = All_Models_Predictions$Majority, reference = All_Models_Predictions$Actual)

# Create the Function for Confusion Matrix
# Draw a styled 2x2 confusion-matrix graphic for the majority-vote
# ensemble using base graphics.
#
# cm: a caret::confusionMatrix object.  The function reads cm$table
#     (the 2x2 cell counts), cm$byClass (per-class statistics) and
#     cm$overall (overall statistics).  Labels for the statistics are
#     taken from names() on the object itself, so they stay correct.
#
# All rect()/text() positions are hand-tuned to the fixed
# plot(c(100, 345), c(300, 450)) canvas; change coordinates together,
# not piecemeal.  Matrix rows are Predicted classes, columns Actual.
draw_confusion_matrix_Majority <- function(cm) {

  # top panel (rows 1-2 of the layout): matrix; bottom panel: details
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Majority of Votes', cex.main=2)

  # create the matrix: header cells for the Actual classes on top,
  # shaded cells below for the two Predicted classes
  rect(150, 430, 240, 370, col='#1c6155')
  text(195, 435, 'No Loan', cex=1.2)
  rect(250, 430, 340, 370, col='#1c615570')
  text(295, 435, 'Loan', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')
  text(140, 400, 'No Loan', cex=1.2, srt=90)
  text(140, 335, 'Loan', cex=1.2, srt=90)

  # add in the cm results: cm$table flattened column-wise into the
  # four cell positions drawn above
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')

  # add in the specifics: elements 1, 2, 5, 6 and 7 of cm$byClass,
  # each printed with its own name above its rounded value
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)

  # add in the accuracy information: elements 1 and 2 of cm$overall
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}

# Plot the Confusion Matrix for the majority-vote ensemble
draw_confusion_matrix_Majority(Confusion_Matrix_Majority)

Comments: The Majority Vote has an accuracy of 97.2%, which is slightly better than LR and KNN but less accurate than CT. The Specificity decreased here compared to CT, since we have 0.71.

Average of Predicted Probabilities

# Ensemble 2: average of the three models' predicted probabilities of "Loan".
# Vectorized element-wise mean of the three probability vectors, rounded to
# 4 decimals. Replaces the former row-by-row loop and the fragile positional
# index `[, 6]` with a direct named-column assignment.
All_Models_Predictions$Average <- round(
  (CT_Predictions_Proba[, 2] + logit.reg.pred + Predictions_KNN_Training_Proba) / 3,
  4
)

# Show the Average column while it still holds probabilities (before thresholding)
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Average Probabilities Added) - 3 Models") 
# Apply the classification cutoff: probability strictly above 0.5 becomes
# class 1, anything else (including exactly 0.5) becomes class 0.
# Vectorized replacement for the former row loop; as.numeric(TRUE) is 1.
All_Models_Predictions$Average <- as.numeric(All_Models_Predictions$Average > 0.5)

# Turn the 0/1 average vote into a labelled factor so confusionMatrix()
# can compare it against the Actual column
All_Models_Predictions$Average <- factor(
  All_Models_Predictions$Average,
  levels = c(0, 1),
  labels = c("No Loan", "Loan")
)

# Display the table with the labelled Average column
DT::datatable(All_Models_Predictions, caption = "Predicted VS Actual Loan in Validation (Average Added) - 3 Models") 

Confusion Matrix with the Average Predicted Probabilities

# Confusion matrix: averaged-probability ensemble predictions vs actual labels
Confusion_Matrix_Average <- confusionMatrix(
  data = All_Models_Predictions$Average,
  reference = All_Models_Predictions$Actual
)

# Render a caret::confusionMatrix as an annotated base-graphics figure:
# a colour-coded 2x2 table in the top panel and key statistics below.
draw_confusion_matrix_Average <- function(cm) {

  # Two stacked panels: the matrix (double height) and a details strip
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Average Predicted Probabilities', cex.main=2)

  # 2x2 grid: solid fill on the agreement cells, translucent on the others
  fill_solid <- '#1c6155'
  fill_faded <- '#1c615570'
  rect(150, 430, 240, 370, col=fill_solid)
  rect(250, 430, 340, 370, col=fill_faded)
  rect(150, 305, 240, 365, col=fill_faded)
  rect(250, 305, 340, 365, col=fill_solid)

  # Labels around the grid
  text(195, 435, 'No Loan', cex=1.2)
  text(295, 435, 'Loan', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'No Loan', cex=1.2, srt=90)
  text(140, 335, 'Loan', cex=1.2, srt=90)

  # Cell counts, in the column-major order produced by as.numeric(cm$table)
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex=1.6, font=2, col='white')

  # Lower panel: a selection of per-class statistics from cm$byClass
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  for (j in seq_along(stat_idx)) {
    text(stat_x[j], 85, names(cm$byClass[stat_idx[j]]), cex=1.2, font=2)
    text(stat_x[j], 70, round(as.numeric(cm$byClass[stat_idx[j]]), 3), cex=1.2)
  }

  # Overall statistics (first two entries of cm$overall)
  overall_x <- c(30, 70)
  for (j in seq_along(overall_x)) {
    text(overall_x[j], 35, names(cm$overall[j]), cex=1.5, font=2)
    text(overall_x[j], 20, round(as.numeric(cm$overall[j]), 3), cex=1.4)
  }
}

# Plot the Confusion Matrix for the averaged-probability ensemble
draw_confusion_matrix_Average(Confusion_Matrix_Average)

Comments: Averaging the predicted probabilities yields a very slight accuracy gain of 0.4%, while the Specificity improves more noticeably, by 4.4%.

d. Compare Error Rates for the 3 individuals methods and the 2 ensemble methods.

# Computing Error Rates (error rate = 1 - accuracy)

# Individual Methods
Error_CT <- 1 - Accuracy_CT
Error_KNN <- 1 - Accuracy_KNN
Error_LR <- 1 - Accuracy_LR

# Ensemble Methods (overall[1] is Accuracy in a caret confusionMatrix)
Error_Majority <- 1 - Confusion_Matrix_Majority$overall[1]
Error_Average <- 1 - Confusion_Matrix_Average$overall[1]

# Collect into a one-row table for display
Error_Rates_DF <- cbind(Error_LR, Error_KNN, Error_CT, Error_Majority, Error_Average)

# Round to 3 decimals (the previous comment claimed 5; 3 is what the code does)
Error_Rates_DF <- round(Error_Rates_DF, 3)

DT::datatable(Error_Rates_DF, caption = "Error Rates from 3 Individual Methods and 2 Ensembles Methods") 

Comments: CT is the one with the smallest error rate followed by Average Method and Majority Method.

Ex 13.2

eBay Auctions - Boosting and Bagging

# Loading the eBay Auctions dataset from disk with data.table::fread
eBayAuction_DF <- fread("data/eBayAuctions.csv")

Partition the data: 60% training, 40% validation.

# Seed for a reproducible split
set.seed(1)

# Target proportions: 60% training / 40% validation
Training_Proportion <- 0.6
Validation_Proportion <- 1 - Training_Proportion

# Bernoulli row assignment (hits the target proportions only in expectation).
# Named `in_training` so the result no longer masks base::sample().
in_training <- sample(
  c(TRUE, FALSE),
  nrow(eBayAuction_DF),
  replace = TRUE,
  prob = c(Training_Proportion, Validation_Proportion)
)

EB_Training <- eBayAuction_DF[in_training, ]

EB_Validation <- eBayAuction_DF[!in_training, ]

# Achieved proportions
Training_Proportion_Check <- nrow(EB_Training) / nrow(eBayAuction_DF)
Validation_Proportion_Check <- nrow(EB_Validation) / nrow(eBayAuction_DF)

# Report the realized split
print(paste("Proportion in Training is", round(Training_Proportion_Check*100),"%", "and in Validation is",round(Validation_Proportion_Check*100),"%"))
FALSE [1] "Proportion in Training is 61 % and in Validation is 39 %"

a. Run a Classification Tree, using default controls of rpart()

Actual VS Predictions

# Seed for reproducibility
set.seed(1)

# Tree Packages
library(rpart)
library(rpart.plot)

# Keep unfactored copies of both partitions (used later by the lift charts)
EB_Training_Pure <- EB_Training
EB_Validation_Pure <- EB_Validation

# Encode the outcome as a labelled factor in both partitions
EB_Training$`Competitive?` <- factor(EB_Training$`Competitive?`, levels = c(0,1),labels = c("No","Yes")) 
EB_Validation$`Competitive?` <- factor(EB_Validation$`Competitive?`, levels = c(0,1),labels = c("No","Yes")) 

# Classification tree with rpart's default controls
CT <- rpart(EB_Training$`Competitive?` ~ . , data=EB_Training)

# Class predictions and class probabilities on the validation set
CT_Predictions <- predict(CT, EB_Validation, type = 'class')
CT_Predictions_Prob <- predict(CT, EB_Validation, type = 'prob')

# Side-by-side table of actual vs predicted outcomes,
# named directly instead of renaming columns afterwards
Predictions_Actual_CT <- data.frame(
  Actual = EB_Validation$`Competitive?`,
  Predicted = CT_Predictions
)

# Render Table
DT::datatable(Predictions_Actual_CT, caption = "Actual VS Predicted with Default Classification Trees") 

Confusion Matrix

# Seed (kept for parity with the surrounding chunks)
set.seed(1)

# Confusion matrix: tree predictions vs actual validation outcomes
Confusion_Matrix_CT_2 <- confusionMatrix(
  data = CT_Predictions,
  reference = EB_Validation$`Competitive?`
)

# Render a caret::confusionMatrix as an annotated base-graphics figure:
# a colour-coded 2x2 table in the top panel and key statistics below.
draw_confusion_matrix_CT_2 <- function(cm) {

  # Two stacked panels: the matrix (double height) and a details strip
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Default Classification Trees', cex.main=2)

  # 2x2 grid: solid fill on the agreement cells, translucent on the others
  fill_solid <- '#1c6155'
  fill_faded <- '#1c615570'
  rect(150, 430, 240, 370, col=fill_solid)
  rect(250, 430, 340, 370, col=fill_faded)
  rect(150, 305, 240, 365, col=fill_faded)
  rect(250, 305, 340, 365, col=fill_solid)

  # Labels around the grid
  text(195, 435, 'No', cex=1.2)
  text(295, 435, 'Yes', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'No', cex=1.2, srt=90)
  text(140, 335, 'Yes', cex=1.2, srt=90)

  # Cell counts, in the column-major order produced by as.numeric(cm$table)
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex=1.6, font=2, col='white')

  # Lower panel: a selection of per-class statistics from cm$byClass
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  for (j in seq_along(stat_idx)) {
    text(stat_x[j], 85, names(cm$byClass[stat_idx[j]]), cex=1.2, font=2)
    text(stat_x[j], 70, round(as.numeric(cm$byClass[stat_idx[j]]), 3), cex=1.2)
  }

  # Overall statistics (first two entries of cm$overall)
  overall_x <- c(30, 70)
  for (j in seq_along(overall_x)) {
    text(overall_x[j], 35, names(cm$overall[j]), cex=1.5, font=2)
    text(overall_x[j], 20, round(as.numeric(cm$overall[j]), 3), cex=1.4)
  }
}

# Plot the Confusion Matrix for the default classification tree
draw_confusion_matrix_CT_2(Confusion_Matrix_CT_2)

Comments: The Overall Accuracy is 0.81 for this Default Classification Trees with a specificity of 0.716 and sensitivity of 0.915. The Model is better at True Positive than True Negative.

Lift Chart

# Seed (kept for parity with the surrounding chunks)
set.seed(1)

# Load Gains
library(gains)

# Decile-wise lift: mean response per decile relative to the overall mean.
# Hoist the repeated factor-to-numeric coercion into one local vector.
actual_num <- as.numeric(as.character(EB_Validation_Pure$`Competitive?`))
gain <- gains(actual_num, CT_Predictions_Prob[,2], groups = 10, ties.method=c("first"))
barplot(
  gain$mean.resp / mean(actual_num),
  names.arg = gain$depth,
  xlab = "Percentile",
  ylab = "Mean Response",
  main = "Decile-wise lift chart",
  col = "#1c6155"
)

# Top-decile lift
library(lift)
print(paste("Top decile lift by ordering the data by the predictions, and computing the proportion of positives in the top 10%.","Here the top decile is",TopDecileLift(CT_Predictions_Prob[,2], actual_num)))
FALSE [1] "Top decile lift by ordering the data by the predictions, and computing the proportion of positives in the top 10%. Here the top decile is 1.774"

Comments: Our first decile has a mean response of 1.774, showing that our model performs almost twice as well as a baseline random model. The lift then quickly falls off from the 5th decile onwards.

b. Run a boosted tree with the same predictors

# Seed for reproducibility of the resampled ensemble
set.seed(1)

library(adabag)

# Work on a copy so EB_Training keeps its original column types
EB_Training_Factor <- EB_Training

# adabag expects categorical predictors to be factors
for (col in c("Category", "currency", "endDay")) {
  EB_Training_Factor[[col]] <- factor(EB_Training_Factor[[col]])
}

# Fit the boosted-tree ensemble and score the validation set
boost <- boosting(`Competitive?` ~ ., data = EB_Training_Factor) 

Prediction_Boost <- predict(boost, EB_Validation, type="class")

# Predicted class labels as a factor for confusionMatrix()
Prediction_Boost_Factor <- as.factor(Prediction_Boost$class)

# Confusion matrix: boosted predictions vs actual outcomes
Confusion_Matrix_BOOST <- confusionMatrix(Prediction_Boost_Factor, EB_Validation$`Competitive?`)

# Render a caret::confusionMatrix as an annotated base-graphics figure:
# a colour-coded 2x2 table in the top panel and key statistics below.
draw_confusion_matrix_BOOST <- function(cm) {

  # Two stacked panels: the matrix (double height) and a details strip
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Boosted Trees', cex.main=2)

  # 2x2 grid: solid fill on the agreement cells, translucent on the others
  fill_solid <- '#1c6155'
  fill_faded <- '#1c615570'
  rect(150, 430, 240, 370, col=fill_solid)
  rect(250, 430, 340, 370, col=fill_faded)
  rect(150, 305, 240, 365, col=fill_faded)
  rect(250, 305, 340, 365, col=fill_solid)

  # Labels around the grid
  text(195, 435, 'No', cex=1.2)
  text(295, 435, 'Yes', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'No', cex=1.2, srt=90)
  text(140, 335, 'Yes', cex=1.2, srt=90)

  # Cell counts, in the column-major order produced by as.numeric(cm$table)
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex=1.6, font=2, col='white')

  # Lower panel: a selection of per-class statistics from cm$byClass
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  for (j in seq_along(stat_idx)) {
    text(stat_x[j], 85, names(cm$byClass[stat_idx[j]]), cex=1.2, font=2)
    text(stat_x[j], 70, round(as.numeric(cm$byClass[stat_idx[j]]), 3), cex=1.2)
  }

  # Overall statistics (first two entries of cm$overall)
  overall_x <- c(30, 70)
  for (j in seq_along(overall_x)) {
    text(overall_x[j], 35, names(cm$overall[j]), cex=1.5, font=2)
    text(overall_x[j], 20, round(as.numeric(cm$overall[j]), 3), cex=1.4)
  }
}

# Plot the Confusion Matrix for the boosted-tree model
draw_confusion_matrix_BOOST(Confusion_Matrix_BOOST)

Comments: The Overall Accuracy is 0.886 with a balanced Sensitivity and Specificity, both less than 0.9. The Specificity is better with Boosted Tree than Default Classification Trees.

Lift Chart

# Seed (kept for parity with the surrounding chunks)
set.seed(1)

# Load Gains
library(gains)

# Decile-wise lift for the boosted model, relative to the overall mean response
actual_num <- as.numeric(as.character(EB_Validation_Pure$`Competitive?`))
gain_1 <- gains(actual_num, Prediction_Boost$prob[,2], groups = 10, ties.method=c("first"))
barplot(
  gain_1$mean.resp / mean(actual_num),
  names.arg = gain_1$depth,
  xlab = "Percentile",
  ylab = "Mean Response",
  main = "Decile-wise lift chart",
  col = "#1c6155"
)

# Top-decile lift
library(lift)
print(paste("Top decile lift by ordering the data by the predictions, and computing the proportion of positives in the top 10%.","Here the top decile is",TopDecileLift(Prediction_Boost$prob[,2], actual_num)))
FALSE [1] "Top decile lift by ordering the data by the predictions, and computing the proportion of positives in the top 10%. Here the top decile is 1.897"

Comments: Our first decile has a mean response of 1.897, showing that our model performs almost twice as well as a baseline random model. The lift then quickly falls off from the 6th decile onwards.

c. Run a bagged tree with the same predictors

# Seed for reproducibility of the bootstrap resamples
set.seed(1)

# Load Package
library(adabag)

# Fit a bagged-tree ensemble on the training partition
bag <- bagging(`Competitive?` ~ ., data = EB_Training) 

# Score the validation set, then coerce the predicted labels to a factor
Prediction_Bagging <- predict(bag, EB_Validation, type="class")

Prediction_Bagging_Factor <- as.factor(Prediction_Bagging$class)

# Confusion matrix: bagged predictions vs actual outcomes
Confusion_Matrix_BT <- confusionMatrix(Prediction_Bagging_Factor, EB_Validation$`Competitive?`)

# Render a caret::confusionMatrix as an annotated base-graphics figure:
# a colour-coded 2x2 table in the top panel and key statistics below.
draw_confusion_matrix_BT <- function(cm) {

  # Two stacked panels: the matrix (double height) and a details strip
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Bagged Trees', cex.main=2)

  # 2x2 grid: solid fill on the agreement cells, translucent on the others
  fill_solid <- '#1c6155'
  fill_faded <- '#1c615570'
  rect(150, 430, 240, 370, col=fill_solid)
  rect(250, 430, 340, 370, col=fill_faded)
  rect(150, 305, 240, 365, col=fill_faded)
  rect(250, 305, 340, 365, col=fill_solid)

  # Labels around the grid
  text(195, 435, 'No', cex=1.2)
  text(295, 435, 'Yes', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'No', cex=1.2, srt=90)
  text(140, 335, 'Yes', cex=1.2, srt=90)

  # Cell counts, in the column-major order produced by as.numeric(cm$table)
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex=1.6, font=2, col='white')

  # Lower panel: a selection of per-class statistics from cm$byClass
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  for (j in seq_along(stat_idx)) {
    text(stat_x[j], 85, names(cm$byClass[stat_idx[j]]), cex=1.2, font=2)
    text(stat_x[j], 70, round(as.numeric(cm$byClass[stat_idx[j]]), 3), cex=1.2)
  }

  # Overall statistics (first two entries of cm$overall)
  overall_x <- c(30, 70)
  for (j in seq_along(overall_x)) {
    text(overall_x[j], 35, names(cm$overall[j]), cex=1.5, font=2)
    text(overall_x[j], 20, round(as.numeric(cm$overall[j]), 3), cex=1.4)
  }
}

# Plot the Confusion Matrix for the bagged-tree model
draw_confusion_matrix_BT(Confusion_Matrix_BT)

Comments: The Overall Accuracy is 0.853 for this Bagged Tree. The Sensitivity is larger than the Specificity such as in the Default Classification Trees and our Specificity is lower than the boosted Trees model.

Lift Chart

# Seed (kept for parity with the surrounding chunks)
set.seed(1)

# Load Gains
library(gains)

# Decile-wise lift for the bagged model, relative to the overall mean response
actual_num <- as.numeric(as.character(EB_Validation_Pure$`Competitive?`))
gain_2 <- gains(actual_num, Prediction_Bagging$prob[,2],groups = 10, ties.method=c("first"))
barplot(
  gain_2$mean.resp / mean(actual_num),
  names.arg = gain_2$depth,
  xlab = "Percentile",
  ylab = "Mean Response",
  main = "Decile-wise lift chart",
  col = "#1c6155"
)

# Top-decile lift
library(lift)
print(paste("Top decile lift by ordering the data by the predictions, and computing the proportion of positives in the top 10%.","Here the top decile is",TopDecileLift(Prediction_Bagging$prob[,2], actual_num)))
FALSE [1] "Top decile lift by ordering the data by the predictions, and computing the proportion of positives in the top 10%. Here the top decile is 1.897"

Comments: Our first decile has a mean response of 1.897. This model gives almost exactly the same lift as the previous model.

d. Run a random forest with argument mtry = 4

# Seed for reproducibility of the forest's random feature sampling
set.seed(1)

# Load Package
library(randomForest)

# Random forest with mtry = 4 candidate features tried at each split
Random_Forest <- randomForest(EB_Training$`Competitive?` ~ ., data = EB_Training, mtry = 4)

# Class predictions and class probabilities on the validation set
Prediction_RF <- predict(Random_Forest, EB_Validation, type="class")

Prediction_RF_Prob <- predict(Random_Forest, EB_Validation, type="prob")

# Confusion matrix: forest predictions vs actual outcomes
Confusion_Matrix_RF <- confusionMatrix(data = Prediction_RF, reference = EB_Validation$`Competitive?`)

# Render a caret::confusionMatrix as an annotated base-graphics figure:
# a colour-coded 2x2 table in the top panel and key statistics below.
draw_confusion_matrix_RF <- function(cm) {

  # Two stacked panels: the matrix (double height) and a details strip
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Random Forests', cex.main=2)

  # 2x2 grid: solid fill on the agreement cells, translucent on the others
  fill_solid <- '#1c6155'
  fill_faded <- '#1c615570'
  rect(150, 430, 240, 370, col=fill_solid)
  rect(250, 430, 340, 370, col=fill_faded)
  rect(150, 305, 240, 365, col=fill_faded)
  rect(250, 305, 340, 365, col=fill_solid)

  # Labels around the grid
  text(195, 435, 'No', cex=1.2)
  text(295, 435, 'Yes', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'No', cex=1.2, srt=90)
  text(140, 335, 'Yes', cex=1.2, srt=90)

  # Cell counts, in the column-major order produced by as.numeric(cm$table)
  counts <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), counts,
       cex=1.6, font=2, col='white')

  # Lower panel: a selection of per-class statistics from cm$byClass
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  for (j in seq_along(stat_idx)) {
    text(stat_x[j], 85, names(cm$byClass[stat_idx[j]]), cex=1.2, font=2)
    text(stat_x[j], 70, round(as.numeric(cm$byClass[stat_idx[j]]), 3), cex=1.2)
  }

  # Overall statistics (first two entries of cm$overall)
  overall_x <- c(30, 70)
  for (j in seq_along(overall_x)) {
    text(overall_x[j], 35, names(cm$overall[j]), cex=1.5, font=2)
    text(overall_x[j], 20, round(as.numeric(cm$overall[j]), 3), cex=1.4)
  }
}

# Plot the Confusion Matrix for the random forest
draw_confusion_matrix_RF(Confusion_Matrix_RF)

Comments: The Overall Accuracy is 0.877 for this Random Forest, with a high Specificity but balanced as well with Sensitivity. The Specificity is also higher than previous model (apart from boosted trees).

Lift Chart

# Seed (kept for parity with the surrounding chunks)
set.seed(1)

# Load Gains
library(gains)

# Decile-wise lift for the random forest, relative to the overall mean response
actual_num <- as.numeric(as.character(EB_Validation_Pure$`Competitive?`))
gain_3 <- gains(actual_num, Prediction_RF_Prob[,2], groups = 10, ties.method=c("first"))
barplot(
  gain_3$mean.resp / mean(actual_num),
  names.arg = gain_3$depth,
  xlab = "Percentile",
  ylab = "Mean Response",
  main = "Decile-wise lift chart",
  col = "#1c6155"
)

# Top-decile lift
library(lift)
print(paste("Top decile lift by ordering the data by the predictions, and computing the proportion of positives in the top 10%.","Here the top decile is",TopDecileLift(Prediction_RF_Prob[,2], actual_num)))
FALSE [1] "Top decile lift by ordering the data by the predictions, and computing the proportion of positives in the top 10%. Here the top decile is 1.897"

Comments: Our first decile is having a mean-response of 1.897. This Model is very similar and give the same lift as the previous model.

Compare the bagged tree to the random forest in terms of validation accuracy and lift on first decile. How are the two methods conceptually different?

We can see that the bagged tree and the random forest are quite similar in accuracy: 0.853 vs 0.877, and the lift on the first decile is the same for both. The big conceptual difference is that in a Random Forest only a random subset of features is considered for the best split at each node of a tree, whereas in bagging all features are considered when splitting a node.

Ex 15.3

Customer Rating of Breakfast Cereals

Data Preprocessing

Remove all cereals with missing values

# Import Dataset Cereals.csv
Cereals <- fread("data/Cereals.csv")

# Drop every row containing at least one NA and report how many were removed
Count_Before <- as.numeric(nrow(Cereals))

Cereals <- na.omit(Cereals)

Count_After <- as.numeric(nrow(Cereals))

# Number of rows lost to missing values
Count_NA <- Count_Before - Count_After

print(paste("We removed", Count_NA, "rows containing NA value"))
FALSE [1] "We removed 3 rows containing NA value"

a. Apply hierarchical clustering to the data using Euclidian distance to the normalized measurements

# Setting Seed for reproducibility
set.seed(1)

# Library
library(cluster)
library(dendextend)

# Centre and scale the measurements with caret::preProcess; center/scale
# only apply to numeric columns, so `name` is carried through untouched.
Scaling <- preProcess(Cereals, method = c("center","scale"))
Cereals_Preprocess <- predict(Scaling, Cereals)

# Pairwise Euclidean distances between cereals on the scaled data.
# NOTE(review): Cereals_Preprocess still contains the character `name`
# column -- confirm dist() is effectively operating on numeric input only.
Distance_Euclidian <- dist(Cereals_Preprocess, method = "euclidean")

# Computing Hierarchical Cluster - Single Linkage --------------
hc1 <- hclust(Distance_Euclidian, method = "single")

## Plotting the single-linkage dendrogram, labelled by cereal name
plot(hc1, hang = -1, ann = TRUE, labels=Cereals_Preprocess$name, cex=0.5, main="Cluster Dendrogram for Single Method", sub="with 5 Clusters")

# Choosing K = 5 and outlining the resulting clusters on the dendrogram
k1 <- 5
rect.hclust(hc1, k = k1, border = 2:5)

# Computing Hierarchical Cluster - Complete Linkage --------------
hc2 <- hclust(Distance_Euclidian, method = "complete")

## Plotting the complete-linkage dendrogram with the same K for comparison
plot(hc2, hang = -1, ann = TRUE, labels=Cereals_Preprocess$name, cex=0.5, main="Cluster Dendrogram for Complete Method",sub="with 5 Clusters")
rect.hclust(hc2, k = k1, border = 2:5)

Comments: Single linkage tends to absorb most cereals into the large red cluster on the right, while complete linkage distributes observations more evenly across clusters, as seen in the far-left and far-right red clusters.

Centroid for Single Linkage with 5 Clusters

## Per-cluster centroids: mean of each scaled measurement (columns 4-16)
## within the 5-cluster single-linkage partition
clusters1 <- cutree(hc1, k = k1)
cluster_center1 <- aggregate(Cereals_Preprocess[,4:16], list(cluster = clusters1), mean)
DT::datatable(round(cluster_center1, 2), caption = "Centroid Single Linkage")

Centroid for Complete Linkage with 5 Clusters

## Per-cluster centroids: mean of each scaled measurement (columns 4-16)
## within the 5-cluster complete-linkage partition
clusters2 <- cutree(hc2, k = k1)
cluster_center2 <- aggregate(Cereals_Preprocess[,4:16], list(cluster = clusters2), mean)
DT::datatable(round(cluster_center2, 2), caption = "Centroid Complete Linkage")

b. Which method leads to the most insightful or meaningful clusters ?

We can see that both methods could lead to insightful and meaningful clusters depending on our main goals. Some cluster centroids vary with the linkage method applied, so we may lose insights by not choosing the criterion beforehand with some domain knowledge. We may prefer the method that produces clusters that are farthest apart and most dense (internal validity); but if we already have domain knowledge, we may recognize clusters that exist in real life, so external validity could lead us to choose a different method. Cross-validity can likewise favour the method whose clusters generalize better to other datasets, avoiding overfitting the training data too much. Let's plot their profiles.

Profile of Clusters in Single Linkage

# Required Github Package (qacr, installed from GitHub -- see setup chunk)
library(qacr)

# Bar-chart profile of the single-linkage cluster centroids
(profile_plot(cluster_center1, cluster = "cluster", type = c("bar")))

Comments: We can see that Cluster 1 and Cluster 4 are very distinct from each other, while Clusters 2 and 5 vary across the variables. Clusters 2 and 3 are each dominated by one particular variable, shelf.

Profile of Clusters in Complete Linkage

# Required Github Package (qacr, installed from GitHub -- see setup chunk)
library(qacr)

# Bar-chart profile of the COMPLETE-linkage cluster centroids
# (the previous comment wrongly said "Single"; cluster_center2 is complete linkage)
(profile_plot(cluster_center2, cluster = "cluster", type = c("bar")))

Comments: The complete-linkage method gives us broadly the same clusters, such as Cluster 1, but one main difference is how strongly a single variable dominates each cluster. Clusters 3 and 4 are less dominated by one variable, and all clusters are much smoother in scale. This profile suggests the complete method yields more interesting clusters, with more variables contributing to each, while still separating them well.

c. Choose one of the methods. How many clusters would you use? What distance is used for this cutoff?

Finding Optimal Clusters Number using Height

# Seed (kept for parity with the surrounding chunks)
set.seed(1)

# Dendrogram without cluster boxes, so the merge heights can be read directly
plot(hc2, hang = -1, ann = TRUE, labels=Cereals_Preprocess$name, cex=0.5, main="Cluster Dendrogram for Complete Method",sub="without Clusters")

# One bar per merge, labelled with the number of clusters remaining after it
barplot(
  hc2$height,
  names.arg = (nrow(Cereals_Preprocess) - 1):1,
  xlab = "Clusters",
  ylab = "Height",
  main = "Barplot of Clusters Height"
)

# Sorted merge heights, to locate the largest gap between consecutive merges
sort(hc2$height)
FALSE  [1]  0.1588111  0.2176169  0.6374037  0.7743277  0.9186464  1.0023901
FALSE  [7]  1.1132685  1.1138973  1.3322529  1.3348896  1.3909291  1.5284398
FALSE [13]  1.5623945  1.5761682  1.6125920  1.6233407  1.6351767  1.6833179
FALSE [19]  1.7834452  1.7868094  1.7925264  1.8022347  1.8309541  1.8715635
FALSE [25]  1.8774737  1.9084187  1.9197754  1.9912448  2.0401005  2.1039908
FALSE [31]  2.1286544  2.1989416  2.2358790  2.2701389  2.4440248  2.4668483
FALSE [37]  2.5947319  2.6419096  2.6559139  2.7981865  2.8434385  2.8559988
FALSE [43]  2.8613725  2.9602357  2.9753628  3.0330627  3.0801456  3.0916742
FALSE [49]  3.5825359  3.5902690  3.7552927  3.8281921  3.8940363  3.9219027
FALSE [55]  4.1234852  4.2893775  4.3903035  4.4427693  4.4720495  4.6235399
FALSE [61]  4.9431336  5.3016864  5.3680562  5.9260647  6.0883158  6.5675874
FALSE [67]  6.8434264  7.3982073  7.4676249  8.4865020  8.8350502 11.0703451
FALSE [73] 12.1855307

Comments: To find the optimal number of clusters from our dendrogram, we look for the largest variation in height between cluster merges. The barplot helps us narrow the comparison to a range, here the 3- or 4-cluster solutions. By sorting the merge heights, we can then draw lines on the dendrogram to measure precisely the difference in height after each merge and decide which number of clusters to choose.

# Re-draw the dendrogram without cluster boxes so the merge heights are visible
plot(hc2, hang = -1, ann = TRUE, labels=Cereals_Preprocess$name, cex=0.5, main="Cluster Dendrogram for Complete Method",sub="without Clusters")

# Gap between two consecutive merge heights: 8.487 - 7.468 = 1.02
abline(h=8.4865020 , col="#1A79A7")
abline(h=7.4676249 , col="#1A79A7")
# NOTE(review): legend() is given only a single numeric coordinate; confirm
# the label lands where intended (legend normally takes x and y, or a keyword).
legend(8.8, legend="Delta = 1.02", bty = "n", text.col = "#1A79A7", cex = 0.7)

# Largest gap between consecutive merges: 11.070 - 8.835 = 2.24 -> supports k = 3
abline(h=11.0703451 , col="#1c6155")
abline(h= 8.8350502 , col="#1c6155")
legend(11, legend="Delta = 2.24", bty = "n", text.col = "#1c6155", cex = 0.7)

Comments: To get our optimal cluster number, we take the maximum height gap in the dendrogram, which leads us to k = 3, since the largest delta is 2.24.

Final Cluster Number

## Plotting with 3 Clusters
plot(hc2, hang = -1, ann = TRUE, labels=Cereals_Preprocess$name, cex=0.5, main="Cluster Dendrogram for Complete Method",sub="with 3 Clusters")
rect.hclust(hc2, k = 3, border = 2:5)

Comments: This would be the optimal cluster number with the complete-linkage method for hierarchical clustering.

d. Elementary public schools

Find Cluster of “healthy cereals”

Healthy Cereals Should have Low Calories, Low Sugars, High Proteins and High Fibers according to pickyeaterblog.com

# Set Seed
set.seed(1)

# Centroids of the k = 3 complete-linkage partition (scaled columns 4-16)
clusters3 <- cutree(hc2, k = 3)
cluster_center3 <- aggregate(Cereals_Preprocess[,4:16], list(cluster = clusters3), mean)
DT::datatable(round(cluster_center3, 2), caption = "Centroid Complete Linkage with k=3")

Comments: Cluster 1 looks like the best cluster in terms of averages: low calories, high protein, high fiber and low sugars (not the lowest, but still low). As a bonus, the rating also favours this cluster.

## Dendrogram with the k = 3 partition outlined
plot(hc2, hang = -1, ann = TRUE, labels=Cereals_Preprocess$name, cex=0.5, main="Cluster Dendrogram for Complete Method",sub="with 3 Clusters")
rect.hclust(hc2, k = 3, border = 2:5)

# Split the (unscaled) cereals into one table per cluster
split1 <- split(Cereals, clusters3)

cluster_1 <- split1[["1"]]
cluster_2 <- split1[["2"]]
cluster_3 <- split1[["3"]]

# Render one membership table per cluster
DT::datatable(cluster_1, caption = "Cluster 1 - Considered Healthy")
DT::datatable(cluster_2, caption = "Cluster 2")
DT::datatable(cluster_3, caption = "Cluster 3" )

Comments: We can see that the first cluster contains 3 different healthy cereals, but perhaps variety should be more of a focus since we are asked for different cereals every day. What happens if we use 4 clusters instead of 3?

# Set Seed
set.seed(1)

# Centroid Computation for k=4 and Complete Linkage
# (the previous comment wrongly said k=3; cutree below uses k=4)
clusters4 <- cutree(hc2, k=4)
cluster_center4 = aggregate(Cereals_Preprocess[,4:16],list(cluster=clusters4),mean)
DT::datatable(round(cluster_center4,2), caption = "Centroid Complete Linkage with k=4" )

Comments: The first cluster has low calories, high protein, low fat and high fiber, and its rating is also quite high. Cluster 4 likewise has low calories, low protein, the second-highest fiber, low sugars and a high rating. Let's see if this 4th cluster gives us a few more cereal options for the public school.

## Dendrogram with the k = 4 partition outlined
plot(hc2, hang = -1, ann = TRUE, labels=Cereals_Preprocess$name, cex=0.5, main="Cluster Dendrogram for Complete Method",sub="with 4 Clusters")
rect.hclust(hc2, k = 4, border = 2:5)

# Split the (unscaled) cereals into one table per cluster
split2 <- split(Cereals, clusters4)

cluster_2_1 <- split2[["1"]]
cluster_2_2 <- split2[["2"]]
cluster_2_3 <- split2[["3"]]
cluster_2_4 <- split2[["4"]]

# Render one membership table per cluster
DT::datatable(cluster_2_1, caption = "Cluster 1 - Considered Healthy" )
DT::datatable(cluster_2_2, caption = "Cluster 2" )
DT::datatable(cluster_2_3, caption = "Cluster 3" )
DT::datatable(cluster_2_4, caption = "Cluster 4 - Considered Healthy" )

Comments: The first cluster gives us the same products, and Cluster 4 adds 9 more cereals that could be considered healthy; we could choose among the healthiest of them to complete the public school's breakfast without compromising variety. But for this exercise, the official answer would be Cluster 1 from our clustering with k = 3.

Should the data be normalized?

For hierarchical clustering, we should normalize the dataset since the procedure relies on Euclidean distances. Nevertheless, for interpretability we need comprehensible data about the members of each cluster, so we would use the non-preprocessed dataset for that purpose and readjust the number of clusters according to our goal.